Load libraries

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tinytable))
suppressPackageStartupMessages(library(rairtable))
suppressPackageStartupMessages(library(lme4))
suppressPackageStartupMessages(library(broom))

Load data

Sample metadata

sample_metadata <- read_tsv(paste0("data/metadata.tsv")) %>%
    mutate(Extraction=factor(Extraction, levels=c("ZYMO","DREX","EHEX"))) %>%
    rename(dataset=Dataset)
extraction_data <- airtable("tblBcTZcRG1E9wsGO", "appQpr6MxnaiVHsHy") %>% #get base ID from Airtable browser URL
  read_airtable(., fields = c("EX DNA ng","Datasets_flat"), id_to_col = TRUE) %>% #get 3 columns from MAGs table
  rename(id=1,extract=2,dataset=3) %>%
  filter(dataset %in% sample_metadata$dataset) %>%
  select(dataset,extract)
library_data <- airtable("tblo6AuYpxbbGw9gh", "appQpr6MxnaiVHsHy") %>% #get base ID from Airtable browser URL
  read_airtable(., fields = c("Required PCR cycles","Datasets_flat"), id_to_col = TRUE) %>% #get 3 columns from MAGs table
  rename(id=1,pcr=2,dataset=3) %>%
  filter(dataset %in% sample_metadata$dataset) %>%
  select(dataset,pcr)
indexing_data <- airtable("tblhfsiR4NI9XJQG0", "appQpr6MxnaiVHsHy") %>% #get base ID from Airtable browser URL
  read_airtable(., fields = c("Adaptors (nM)","Library (nM)","Datasets_flat"), id_to_col = TRUE) %>% #get 3 columns from MAGs table
  rename(id=1,adaptors=2,library=3,dataset=4) %>%
  filter(dataset %in% sample_metadata$dataset) %>%
  mutate(ratio=library/(adaptors+library)) %>%
  select(dataset,adaptors,library,ratio)
preprocessing_data <- airtable("tblJfLRU2FIVz37Y1", "appQpr6MxnaiVHsHy") %>% #get base ID from Airtable browser URL
  read_airtable(., fields = c("bases_pre_fastp","bases_post_fastp","host_bases","metagenomic_bases","singlem_fraction","C","EHI_plaintext"), id_to_col = TRUE) %>% #get 3 columns from MAGs table
  rename(id=1,nonpareil=C,dataset=EHI_plaintext,microbial_fraction=singlem_fraction) %>%
  filter(dataset %in% sample_metadata$dataset) %>%
  select(dataset,bases_pre_fastp,bases_post_fastp,bases_post_fastp,metagenomic_bases,microbial_fraction,nonpareil)
assembly_data <- airtable("tblG6ZIvkYN844I97", "appQpr6MxnaiVHsHy") %>% #get base ID from Airtable browser URL
  read_airtable(., fields = c("EHI_number_api","assembly_length","N50","L50","num_contigs","num_bins","metagenomic_bases"), id_to_col = TRUE) %>% #get 3 columns from MAGs table
  rename(id=1,assembly_mapped_bases=metagenomic_bases,dataset=EHI_number_api) %>%
  filter(dataset %in% sample_metadata$dataset) %>%
  select(dataset,assembly_length,N50,L50,num_contigs,num_bins,assembly_mapped_bases)
mapping_data <- airtable("tblWDyQmM9rQ9wq57", "appWbHBNLE6iAsMRV") %>% #get base ID from Airtable browser URL
  read_airtable(., fields = c("EHI_sample_static","MAG_mapping_percentage"), id_to_col = TRUE) %>% #get 3 columns from MAGs table
  rename(id=1,dataset=EHI_sample_static) %>%
  filter(dataset %in% sample_metadata$dataset) %>%
  select(dataset,MAG_mapping_percentage)
all_data <- sample_metadata %>%
    mutate(Taxon=factor(Taxon,levels=c("Amphibian","Bird","Mammal","Reptile","Control"))) %>%
    left_join(extraction_data, by=join_by(dataset==dataset)) %>%
    left_join(library_data, by=join_by(dataset==dataset)) %>%
    left_join(indexing_data, by=join_by(dataset==dataset)) %>%
    left_join(preprocessing_data, by=join_by(dataset==dataset)) %>%
    left_join(assembly_data, by=join_by(dataset==dataset)) %>%
    left_join(mapping_data, by=join_by(dataset==dataset))

Laboratory performance differences

DNA yield

Total amount of DNA extracted from the 150 ul subset of the bead-beaten sample.

all_data %>%
    group_by(Taxon,Extraction) %>%
    summarise(value = sprintf("%.0f±%.0f", mean(extract), sd(extract))) %>%
    pivot_wider(names_from = Extraction, values_from = value) %>%
    tt(caption = "Mean and standard deviation of total DNA nanograms")
tinytable_ue8qb0yoydkr14a5dbb8
Mean and standard deviation of total DNA nanograms
Taxon ZYMO DREX EHEX
Amphibian 297±350 340±475 1138±1195
Bird 71±134 11±11 44±45
Mammal 448±432 256±224 867±730
Reptile 102±71 162±134 250±179
Control 0±0 2±3 1±0
all_data %>%
    ggplot(aes(x=Extraction,y=extract))+ 
        geom_boxplot() + 
        facet_grid(. ~ Taxon, scales = "free") +
        labs(y="DNA yield (ng)",x="Extraction method")

all_data  %>%
    filter(Taxon != "Control") %>%
    lmerTest::lmer(extract ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
    broom.mixed::tidy() %>%
    tt()
tinytable_7nole6vkmqt2gfbkwf0h
effect group term estimate std.error statistic df p.value
fixed NA (Intercept) 229.412625000 129.0917 1.7771296 18.62337 0.091883533
fixed NA ExtractionDREX -37.448812500 100.5929 -0.3722808 59.99998 0.710995523
fixed NA ExtractionEHEX 345.333000000 100.5929 3.4329752 59.99998 0.001087593
ran_pars Sample sd__(Intercept) 0.003841839 NA NA NA NA
ran_pars Species sd__(Intercept) 373.178642666 NA NA NA NA
ran_pars Residual sd__Observation 348.464099979 NA NA NA NA

Library performance

Number of PCR cycles required for reaching the plateau phase of the indexing PCR.

all_data %>%
    group_by(Taxon,Extraction) %>%
    summarise(value = sprintf("%.1f±%.1f", mean(pcr), sd(pcr))) %>%
    pivot_wider(names_from = Extraction, values_from = value) %>%
    tt(caption = "Mean and standard deviation of optimal number of PCR cycles")
tinytable_7f07x5lcbs3dq7s5ttys
Mean and standard deviation of optimal number of PCR cycles
Taxon ZYMO DREX EHEX
Amphibian 14.2±2.1 10.8±3.3 10.2±2.6
Bird 18.0±2.5 18.5±3.9 14.8±2.6
Mammal 11.0±4.0 10.2±1.9 10.3±2.2
Reptile 11.7±4.2 10.3±3.7 12.0±3.5
Control 20.0±2.8 19.8±0.5 22.0±4.2
all_data %>%
    ggplot(aes(x=Extraction,y=pcr))+ 
        geom_boxplot() + 
        facet_grid(. ~ Taxon, scales = "free") +
        labs(y="Optimal number of PCR cycles",x="Extraction method")

all_data  %>%
    filter(Taxon != "Control") %>%
    lmerTest::lmer(pcr ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
    broom.mixed::tidy() %>%
    tt()
tinytable_k63v5m3l5w9eub76hblr
effect group term estimate std.error statistic df p.value
fixed NA (Intercept) 13.708333 0.9696194 14.137850 22.50455 1.117002e-12
fixed NA ExtractionDREX -1.250000 0.8896915 -1.404981 60.00000 1.651832e-01
fixed NA ExtractionEHEX -1.875000 0.8896915 -2.107472 60.00000 3.926431e-02
ran_pars Sample sd__(Intercept) 0.000000 NA NA NA NA
ran_pars Species sd__(Intercept) 2.555902 NA NA NA NA
ran_pars Residual sd__Observation 3.081982 NA NA NA NA

Metagenomic data

Library complexity

Nonpareil estimate of the metagenomic complexity after removing host DNA.

all_data %>%
    group_by(Taxon,Extraction) %>%
    summarise(value = sprintf("%.1f±%.1f", mean(nonpareil), sd(nonpareil))) %>%
    pivot_wider(names_from = Extraction, values_from = value) %>%
    tt(caption = "Mean and standard deviation of extimated metagenomic completeness")
tinytable_9mb7zhfbsrpwvgph2hxj
Mean and standard deviation of extimated metagenomic completeness
Taxon ZYMO DREX EHEX
Amphibian 0.9±0.1 0.8±0.1 0.8±0.1
Bird 0.9±0.1 1.0±0.0 0.8±0.4
Mammal 0.8±0.2 0.8±0.1 0.7±0.3
Reptile 0.9±0.1 0.9±0.0 0.9±0.1
Control 0.0±0.0 0.5±0.6 0.5±0.7
all_data %>%
    ggplot(aes(x=Extraction,y=nonpareil))+ 
        geom_boxplot() + 
        facet_grid(. ~ Taxon, scales = "free") +
        labs(y="Nonpareil completeness",x="Extraction method")

all_data  %>%
    filter(Taxon != "Control") %>%
    lmerTest::lmer(nonpareil ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
    broom.mixed::tidy() %>%
    tt()
tinytable_3k08sjaihu7ljj6u8h9o
effect group term estimate std.error statistic df p.value
fixed NA (Intercept) 0.885552853 0.03450229 25.6664948 40.89542 7.753071e-27
fixed NA ExtractionDREX 0.005536892 0.04336611 0.1276779 48.00020 8.989373e-01
fixed NA ExtractionEHEX -0.112193866 0.04336611 -2.5871326 48.00020 1.276294e-02
ran_pars Sample sd__(Intercept) 0.059565487 NA NA NA NA
ran_pars Species sd__(Intercept) 0.035030813 NA NA NA NA
ran_pars Residual sd__Observation 0.150224598 NA NA NA NA

Alpha diversity

alpha_data <- list.files(path = "results", pattern = "^alpha_.*\\.tsv$", full.names = TRUE) %>%
  map_df(~ read_tsv(.)) %>%
  left_join(sample_metadata,by= join_by(dataset==dataset))
alpha_data %>%
    pivot_longer(!c(dataset,Library,Species,Taxon,Sample,Extraction), names_to = "metric", values_to = "value") %>%
    mutate(metric=factor(metric,levels=c("richness","neutral","phylogenetic"))) %>%
    ggplot(aes(x=Extraction,y=value))+ 
        geom_boxplot() + 
        facet_grid(metric ~ Taxon, scales = "free")

Richness

alpha_data %>%
    lmerTest::lmer(richness ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
    broom.mixed::tidy() %>%
    tt()
tinytable_rte1mhc99b4iyzgiphds
effect group term estimate std.error statistic df p.value
fixed NA (Intercept) 58.43750 12.008400 4.8663854 8.87726 0.0009237868
fixed NA ExtractionDREX 2.18750 4.698926 0.4655319 32.00001 0.6447035294
fixed NA ExtractionEHEX 2.43750 4.698926 0.5187355 32.00001 0.6075141409
ran_pars Sample sd__(Intercept) 15.62413 NA NA NA NA
ran_pars Species sd__(Intercept) 30.71216 NA NA NA NA
ran_pars Residual sd__Observation 13.29057 NA NA NA NA

Neutral

alpha_data %>%
    lmerTest::lmer(neutral ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
    broom.mixed::tidy() %>%
    tt()
tinytable_x1zec4fn4gjmpotpyv7f
effect group term estimate std.error statistic df p.value
fixed NA (Intercept) 29.71528470 5.751141 5.16685008 8.745982 0.0006458923
fixed NA ExtractionDREX -1.59472963 2.085900 -0.76452835 31.999998 0.4501538272
fixed NA ExtractionEHEX -0.04935512 2.085900 -0.02366131 31.999998 0.9812697035
ran_pars Sample sd__(Intercept) 9.00571703 NA NA NA NA
ran_pars Species sd__(Intercept) 14.37531297 NA NA NA NA
ran_pars Residual sd__Observation 5.89981588 NA NA NA NA

Phylogenetic

alpha_data %>%
    lmerTest::lmer(phylogenetic ~ Extraction + (1 | Sample) + (1 | Species), data = ., REML = FALSE) %>%
    broom.mixed::tidy() %>%
    tt()
tinytable_smals393ahnbapa2r74b
effect group term estimate std.error statistic df p.value
fixed NA (Intercept) 4.5870074 0.4577012 10.021838 8.589824 4.977993e-06
fixed NA ExtractionDREX 0.2122362 0.1485377 1.428837 32.000064 1.627406e-01
fixed NA ExtractionEHEX 0.2230529 0.1485377 1.501658 32.000064 1.429897e-01
ran_pars Sample sd__(Intercept) 1.1278928 NA NA NA NA
ran_pars Species sd__(Intercept) 0.9754991 NA NA NA NA
ran_pars Residual sd__Observation 0.4201282 NA NA NA NA

Variance partitioning

variance_data <- list.files(path = "results", pattern = "^var_.*\\.tsv$", full.names = TRUE) %>%
  map_df(~ read_tsv(.))
variance_data %>%
    left_join(sample_metadata %>% select(Species,Taxon) %>% unique(),by=join_by(species==Species)) %>%
    mutate(metric=factor(metric,levels=c("richness","neutral","phylogenetic"))) %>%
    ggplot(aes(x=Taxon,y=r2)) +
    geom_boxplot()+
    ylim(0,1)+
    facet_grid(. ~ metric, scales = "free")

variance_data %>%
    group_by(metric) %>%
    summarise(mean=mean(r2),sd=sd(r2)) %>%
    tt()
tinytable_39h2darcau1obihpnfyt
metric mean sd
neutral 0.06201935 0.04744380
phylogenetic 0.07381277 0.06884687
richness 0.16970818 0.06626388